Preamble

Dependencies

saving = FALSE
#' Plot the ggplot for the percentage composition for different class in different 
#' datasets. 
#' 
#' @param seuratObject the seurat object containing the data
#' @param classLabel string indicating which metadata column in seurat object is for 
#'    the class label (y-axis) for the tile plot
#' @param dataLabel string indicating the metadata column for dataset label (x-axis)
#' @param colorOption color option fot viridis. default is "G" but can change to
#'    other letter 
#'    
#' @author Jieran Sun

plotDataComposition <- function(seruatObject, classLabel, dataLabel, colorOption = "G") {
  ca <- data.table(table(cluster=unlist(seruatObject[[classLabel]]), 
                   sample=unlist(seruatObject[[dataLabel]])))
  ca[, percent := 100*(N/sum(N)), by = sample]
  
  plot_cluster_comp <- ggplot(as.data.frame(ca), aes(sample, cluster, fill=percent)) + 
    geom_tile() + 
    scale_fill_gradientn(colors = viridis_pal(option = colorOption, end = 0.95, direction = -1)(10)) +
    geom_text(aes(label= round(percent, digits = 2)) , color = "white") + 
    coord_fixed(ratio = 0.25) +
    guides(fill = guide_colourbar(barwidth = 0.5,barheight = 18))
}

#' Plot a heatmap object using sechm package to indicate the expression level for 
#' different marker genes with their corresponding cell type annotation
#' 
#' @param sceObject the singleCellExperiment object to input
#' @param markerList the list of marker genes for plotting. The marker gene list 
#'    should have corresponding cell type as their name. A typical list should 
#'    contain several array objects for marker genes with each of them named by the 
#'    corresponding cell types. 
#' @param cellType string or list of string indicating which cell type(s) are used for 
#'    plotting
#' @param heatmapColor color option for heatmap in viridis
#' @param gaps_col if adding column gaps in the heatmap (in the case of plotting
#'    merged sceObject with different column indication)
#'    
#' @author Jieran Sun

plotSpecificHeatmap <- function(sceObject, markerList, cellType = NULL, heatmapColor = "C", gaps_col = NULL){
  
  if (is.null(cellType) == FALSE) {
    markerList <- markerList[cellType]
  }
  # check markerList 
  if (is.null(names(markerList))) {
    names(markerList) <- paste("list", seq_along(markerList), sep = "-")
  }
  
  # Assign a new metadata called cell type to assign the markers on them
  assayNames(sceObject) <- "logcounts"
  rowData(sceObject)$cellType <- NA
  rowData(sceObject)[unlist(markerList),"cellType"] <- rep(names(markerList),lengths(markerList))
  
  # Generate the heatmap based on the selection
  geneToPlot <- unlist(markerList)
  sechm::sechm(sceObject, geneToPlot, 
             assayName = "logcounts", gaps_row = "cellType", gaps_at = gaps_col,
             show_colnames = TRUE, do.scale = TRUE, na_col = "black",
             breaks=1, row_title_rot=0, show_rownames = TRUE, 
             hmcols = viridis_pal(option = heatmapColor)(100))
}

#' Adjusting the marker list assignment based on the heatmap expression. The heatmap
#' expression is used as the heatmap counts are all normalized on a cluster level.
#' By defining a threshold for the assignment, the function helps rearranging the 
#' marker gene from the one cell type (oldCellType) to another (adjustedCellType)
#' 
#' @param heatmapObject the heatmap to use for adjusting the markergene assignment
#' @param cluster which cluster expression is used to adjust the marker gene 
#'    assignment level. it can be a string or a vector of string indicating the 
#'    name of the cluster. If it is a vector of string, please make sure it have 
#'    the same length as threshold, oldCellType and adjustedCellType 
#' @param threshold double or vector of double indicating the threshold of expression 
#'    for cluster reassignment
#' @param oldCellType/adjustedCellType string or vector of string for the change of
#'    marker gene assignment to cell type lists. should have the same length. names
#'    in adjusted cell type should exist in the markerList
#' @param visualization plot the histogram for the expression level in the cluster
#' @param saving save the new marker list
#' @param filePath if save the new marker list, what is the file path 
#' 
#' @author Jieran Sun

AdjustMarkerAssignment <- function(heatmapObject, markerList, cluster, 
                                   threshold, oldCellType, adjustedCellType, 
                                   visualization = FALSE, saving = FALSE, filePath = NULL){
  newMarkerList <- markerList
  # TODO: Check if the dimension agrees
  if (length(cluster) > 1 & length(threshold) == 1) {
    threshold <- rep(threshold, length(cluster))
  } 
  
  geneOfInterest <- lapply(seq_along(cluster), function(ind) {
    heatmapObject@row_names_param$labels[which(heatmapObject@matrix[,cluster[ind]] > threshold[ind])]
  })
  for (geneList in geneOfInterest) {
    for (gene in geneList) {
      if (gene %in% markerList[oldCellType]) {
        newMarkerList[adjustedCellType] <- append(newMarkerList[adjustedCellType], gene)
        newMarkerList[oldCellType] <- newMarkerList[oldCellType][-which(gene == newMarkerList[oldCellType])]
      }}}
  
  if (saving) {
    cellType <- names(newMarkerList)
    MarkerDF <- data.frame(cellType = cellType,
                      markers  = I(unlist(lapply(newMarkerList, paste, collapse=","))))
    write.csv(MarkerDF, file=filePath, quote=FALSE, row.names=FALSE)
  }
  
  return(newMarkerList)
}

#' Plot the overall heatmap for the singleCellExperiment Object with different 
#' colors indicating different cell type and a legend for information (optional)
#' 
#' @param sceObject the singleCellExperiment object to plot
#' @param markerList the list of marker genes for plotting. The marker gene list 
#'    should have corresponding cell type as their name. A typical list should 
#'    contain several array objects for marker genes with each of them named by the 
#'    corresponding cell types. 
#' @param metaData an optional list corresponding the cell type acronym to the 
#'    full name. metaData file is essentially a vector of character (full names)
#'    with the acronym as the name for the characters. the full name will be used 
#'    for the legend. If the metaData is NULL the system will just use the acronym 
#'    from the markerlist as the legend name 
#' @param showAll do we plot all the cell types in the markerList?
#' @param cellType only if showAll is FALSE, we specify marker genes from which 
#'    cell type to plot 
#' @param heatmapColor color options from viridis
#' @param clusterRow do we cluster the row? if so the makerGene-cellType correspondence
#'    cannot be maintained 
#' @param clusterCOl do we cluster the column such that similar cell type/cluster are 
#'    plot next to each other
#' @param showLegend if we want to show legend
#' 
#' @author Jieran Sun
#' 
#' Notice: no. of cell type/legend <= 16! (for color plotting purpose)

showOverallHeatmap <- function(sceObject, markerList, metaData, showAll = TRUE, cellType = NULL, 
                               heatmapColor = "C", clusterRow = FALSE, clusterCol = TRUE,
                               showLegend = TRUE) {
  if (showAll != TRUE) {
    if (is.null(cellType)) {
      stop("Please specify cell types to show.")
    }
    markerList <- markerList[cellType]
  }
  
  if (showLegend == TRUE) {
    # Generate heatmap block and anntoation color
    if(is.null(metaData)) {
      cellname <- names(markerList)
    } else {
      cellname <- c(metaData[names(markerList)])
    }
    
    # Set up the color for the heatmap 
    legendColors <- pal_simpsons()(length(cellname))
    names(legendColors) <- cellname
    legendColors <- list(type = legendColors)
    
    # Set up the annotation matrix
    annoDF <- data.frame(row.names=unlist(markerList), 
               type=rep(cellname, lengths(markerList)))
    
    pheatmap(assay(sceObject)[unlist(markerList),], 
               color = viridis_pal(option = heatmapColor)(100),
               annotation_row = annoDF, 
               annotation_colors = legendColors,
               gaps_row = cumsum(lengths(markerList)),
               split=rep(cellname, lengths(markerList)), 
               cluster_rows=clusterRow, cluster_cols = clusterCol,
               scale="row")
  } else {
    pheatmap(assay(sceObject)[unlist(markerList),], 
               color = viridis_pal(option = heatmapColor)(100),
               gaps_row = cumsum(lengths(markerList)),
               split=rep(cellname, lengths(markerList)), 
               cluster_rows=clusterRow, cluster_cols = clusterCol,
               scale="row")
  }
}

#' Find the top expression genes in each cluster/cell type in the sceObject
#' 
#' @param sceObject singleCellExperiment object. The object should be aggregated 
#'    already with column indicating cell types or clusters 
#' @param markerlist the list of marker genes for plotting. The marker gene list 
#'    should have corresponding cell type as their name. A typical list should 
#'    contain several array objects for marker genes with each of them named by the 
#'    corresponding cell types. 
#' @param n number of top expression gene to return 
#' @param allMarker do we return all the marker gene on the list?
#' @param cellType if allMarker is FALSE, which cell type of genes should we return
#' 
#' @return a data table with column as the cluster/cell type depending on the column of
#'    the sceObject. And n number of rows indicating which genes are the most expressing
#' 
#' @author Jieran Sun

findTopGenes <- function(sceObject, markerlist, n = 20, allMarker = TRUE, cellType = NULL){
  if (allMarker != TRUE) {
    if (is.null(cellType)){
      stop("Specify the cell type for gene return! /n")
    }
    markerlist <- unlist(markerlist[cellType])
  }
  topMat <- data.table(assay(sceObject)[unlist(markerlist),], keep.rownames = TRUE)
  topMat <- topMat[, lapply(.SD, function(x){
                            Best_n <- sort(x, index.return=TRUE, decreasing=TRUE)
                            rn[Best_n$ix[1:n]]
                          }), .SDcols = colnames(topMat)[-1]]
  return(topMat)
}
 
#' Plot a list of Violin plot/Ridge plot/Heatmap plot given a list of genes. 
#' 
#' @param conutObject either the seurat object (for violin and ridge plot) or the
#'    sceObject (for the heatmap plot)
#' @param feature_list essentially the markerList in the previous functions
#' @param plot_type type of plot to return, can be Violin plot ("Vln"), ridge 
#'    plot ("Rdg"), and heatmap ("Heatmap")
#' @param group_var  columns on the metadata file in the seurat object indicating 
#'    if we're using clusters or cell types (for heatmap it can be NULL, but it 
#'    need to be used for the pdf generated if saving is TRUE, so maybe just 
#'    keep it as the same way)
#' @param saving if we want to save the final results
#' 
#' @return a list of plots based on the parameters


plotMarkerList <- function(countObject, feature_list, plot_type = c("Vln", "Rdg", "Htm"), 
                           group_var = c("orig.ident", "seurat_clusters","label_calibrated","label_auto"), 
                           saving = FALSE){
  
  if (is.list(feature_list[[1]]) != TRUE) {
    feature_list <- list(feature_list)
  }
  
  Plot_list <- lapply(seq_along(feature_list), function(ind) {
    feature <- feature_list[[ind]]
    if (plot_type == "Vln") {
      Plot <- VlnPlot(countObject, features = feature, 
                group.by = sprintf("%s", group_var), cols = pal_simpsons()(16),pt.size = 0) + 
        plot_annotation(title = sprintf("list_%s", ind))
    } else if (plot_type == "Rdg") {
      Plot <- RidgePlot(countObject, features = feature, 
                group.by = sprintf("%s", group_var), cols = pal_simpsons()(16)) + 
        plot_annotation(title = sprintf("list_%s", ind))
    } else if (plot_type == "Htm") {
      Plot <- sechm::sechm(countObject, feature, 
             assayName = "logcounts", cluster_cols = FALSE,
             show_colnames = TRUE, do.scale = TRUE, na_col = "black",
             breaks=1, row_title_rot=0, show_rownames = TRUE,
             hmcols = viridis_pal(option = "C")(50))
    }
      return(Plot)
  })
  
  if (saving == TRUE) {
    label <- strsplit(group_var, "[.]|_")[[1]][2]
    pdf(file.path(working_path, sprintf("Visualization/Markers_%s_%s.pdf", plot_type, label)), 
        width = 20, height = 20)
    print(Plot_list)
    dev.off()
  }
  
  return(Plot_list)
}

Data creation and input

# three different samples: 100-WT, 101 - GpnmbKO, 102-GpnmbKO)
seurat_list <- lapply(list.files( data_path, pattern = "*.h5", full.name =TRUE), 
  function(file){
    file_ind <- which(list.files( data_path, pattern = "*.h5", full.name =TRUE) == file)
    seurat_object <- Seurat::Read10X_h5(file , use.names = TRUE, unique.features = TRUE) %>% 
      CreateSeuratObject(project = c("WT", "GpnmbKO_1", "GpnmbKO_2")[file_ind], min.cells = 3, min.features = 200)
    seurat_object$stim <- sprintf("cond%s", file_ind-1)
    seurat_object[["percent.mt"]] <- PercentageFeatureSet(seurat_object, pattern = "^mt-")
    seurat_object[["percent.rb"]] <- PercentageFeatureSet(seurat_object, pattern = "^Rp[sl]")
    seurat_object
})

scRNA processing, integration and clustering

# Leaving only the intersected genes
seurat_list <- lapply(seurat_list, function(x) rownames(x)) %>% 
  {Reduce(intersect, .)} %>%
  {lapply(seurat_list, function(x) x[.,])}

# Roughly merged the genes together
seurat_combi <- merge(seurat_list[[1]], seurat_list[-1], project = "CombinedSeurat") 
## Warning in CheckDuplicateCellNames(object.list = objects): Some cell names are
## duplicated across objects provided. Renaming to enforce unique cell names.
rm(seurat_list)

Plot Vplot for QC visualization

(VP_Overall <- VlnPlot(seurat_combi, features = c("nFeature_RNA", "nCount_RNA", "percent.mt", "percent.rb"), 
        group.by = "orig.ident", cols = pal_npg("nrc")(9),
        ncol = 4, pt.size = 0) + xlab("conditions") )

# QC processing (remove ribosome and mitochrondial RNA)
# try 7 next time? rb was 10 now 15
mt <- 7
rb <- 15
seurat_combi <- subset(seurat_combi, subset = nFeature_RNA > 200 & nFeature_RNA < 10000 & percent.mt < mt & percent.rb < rb ) 

# Recheck the violin plot after QC
(VP_afterQC <- VlnPlot(seurat_combi, features = c("nFeature_RNA", "nCount_RNA", "percent.mt", "percent.rb"), 
        group.by = "orig.ident", cols = pal_npg("nrc")(9),
        ncol = 4, pt.size = 0) + xlab("conditions"))

ver = FALSE
set.seed(2023)
# Normalize, scale and run PCA on the seurat object
seurat_combi <- seurat_combi %>%  
    NormalizeData(verbose = ver) %>%
    FindVariableFeatures(nfeatures = 3000, verbose = ver)  %>%
    ScaleData(verbose = ver) %>%
    RunPCA(npcs = 50, verbose = ver)

#' Select essential PCs for clustering 
#' PC is selected based on 2 criteria, either the no. of pcs can already 
#' expalined 80% of the variables and the current one falls under 5%, or 
#' the acculumated change in the variance explained in PC fall under 0.1%

pct <- seurat_combi[["pca"]]@stdev / sum(seurat_combi[["pca"]]@stdev) * 100
choice1 <- which(cumsum(pct) > 80 & pct < 5)[1]
choice2 <- sort(which((pct[1:length(pct) - 1] - pct[2:length(pct)]) > 0.1 & cumsum(pct) > 60 ), decreasing = T)[1] + 1
## Warning in (pct[1:length(pct) - 1] - pct[2:length(pct)]) > 0.1 & cumsum(pct) >
## : longer object length is not a multiple of shorter object length
pcs <- min(choice1, choice2, 40)

# Plot an elbow plot to visualize the selection (Sanity check)
Seurat::ElbowPlot(seurat_combi, ndims= 50) + ylab("variance explained (%)") +
                geom_point(x = pcs, y = seurat_combi[["pca"]]@stdev[pcs] , colour = "red") +
                geom_label(
                  label=sprintf("PC selected: %1.0f",pcs), 
                  x=pcs,
                  y=seurat_combi[["pca"]]@stdev[pcs] + 1,
                  label.padding = unit(0.55, "lines"), # Rectangle size around label
                  label.size = 0.35,
                  color = "black",
                  fill="#69b3a2")

set.seed(2023)
# Run Harmony integration on the dataset, and find the clusters
seurat_combi <- RunHarmony(seurat_combi, group.by.vars = "orig.ident", 
                           dims.use = 1:pcs, max.iter.harmony = 50, verbose = ver)
## Warning: Invalid name supplied, making object name syntactically valid. New
## object name is Seurat..ProjectDim.RNA.harmony; see ?make.names for more details
## on syntax validity
seurat_combi <- FindNeighbors(seurat_combi, reduction = "harmony", dims = 1:pcs, verbose = ver)
seurat_combi <- FindClusters(seurat_combi, resolution = 0.6, verbose = ver) # used to be 0.5

# Run UMAP based on the clsuter information
seurat_combi <- RunUMAP(seurat_combi, reduction = "harmony", dims = 1:pcs, verbose = ver)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
# UMAP Plot colored based on clusters
Plot1 <- DimPlot(seurat_combi, group.by = "seurat_clusters", label = TRUE, label.color = "black", pt.size = 0.3,  cols = pal_simpsons()(16))  + plot_annotation(title="Clusters") + NoAxes()

# UMAP plot colored based on data sources
Plot2 <- DimPlot(seurat_combi, group.by = "orig.ident",pt.size = 0.3, cols = pal_simpsons()(3)) + 
  plot_annotation(title="Conditions") + NoAxes()

# Visualize the two plots
(plot_UMAP <- Plot1 + Plot2)

(plot_clusterComp <- plotDataComposition(seruatObject = seurat_combi, classLabel = "seurat_clusters", dataLabel = "orig.ident"))

Import marker gene list

Here we plot the heatmap for all the marker genes found in the dataset, regardless of their expression level.

old_way_loading = FALSE

if (old_way_loading){
  
  # Import list of marker genes 
  marker_list <- readxl::read_excel(file.path(working_path,"Markers/Marker_list_BU.xlsx"))[-10]
  
  Meta_Data <- unlist(readxl::read_excel(file.path(working_path,"Markers/Marker_list_BU.xlsx"))[10])
  meta_data <- unlist(data.table::tstrsplit(Meta_Data, ":", fixed = TRUE, keep = 2))
  names(meta_data) <- unlist(tstrsplit(Meta_Data, ":", fixed = TRUE, keep = 1))
  meta_data <- meta_data[!is.na(meta_data)]
  
  # Filter and leave only those marker genes existed in the cluster
  exist_list <- lapply(marker_list, function(x){
    x <- x[!is.na(x)]
    x <- x[x %in% row.names(seurat_combi)]})
  
} else{
  # Extract the data directly from the system
  exist_list <- readRDS("Markers/marker_list.rds")
  meta_data  <- exist_list[[length(exist_list)]]
  exist_list <- lapply(exist_list[-length(exist_list)], function(x){
    x <- x[!is.na(x)]
    x <- x[x %in% row.names(seurat_combi)]})
}
#' If we plot the overall heatmap (The process is time-consuming and require computing power), notice 
#' the large heatmap only provide limited information and is very time-consuming. By default we don't
#' run this heatmap but if you want a detailed overview of the dataset/dataset is relatively small then 
#' you can use it for an overview.

OverallHeatmap = FALSE

if (OverallHeatmap){
# Find differential marker genes in across different clusters 
seurat_markers <- Seurat::FindAllMarkers(seurat_combi, min.pct = 0.2, test.use = "bimod", verbose = FALSE) %>%
    group_by(cluster) %>%
    subset(gene %in% unlist(exist_list))
# Plot the heatmap
(heatmap_feature <- DoHeatmap(seurat_combi, features = unique(seurat_markers$gene), group.colors = pal_simpsons()(16)))
}

Pseudo bulk analysis for annotation

# Create SingleCellExperiment object for easier manipulation
logcounts <- GetAssayData(seurat_combi)
seurat_sce <- SingleCellExperiment(assays = logcounts, 
                                   colData = seurat_combi@meta.data, 
                                   rowData = rownames(seurat_combi))
assayNames(seurat_sce) <- "logcounts"
# Find the average expression of each gene in each cluster
sce_cluster <- muscat::aggregateData(seurat_sce, "logcounts", by=c("seurat_clusters"), fun="mean")
SummarizedExperiment::assayNames(sce_cluster) <- "logcounts"

# Assign a new metadata called cell type to assign the markers on them
rowData(sce_cluster)$cellType <- NA
rowData(sce_cluster)[unlist(exist_list),"cellType"] <- rep(names(exist_list),lengths(exist_list))
# Visualizing the heatmap data of the expression value
(heatmap_ClusterOverview <- plotSpecificHeatmap(sce_cluster, exist_list))

# Return average expression of genes in each cell type in each cluster
sce_cellType <- assay(sce_cluster)[unlist(exist_list),]
sce_cellType <- aggregate(t(scale(t(sce_cellType))), 
                 by=list(type=rep(names(exist_list), lengths(exist_list))), 
                 FUN=mean)
sce_cellType <- as.data.table(sce_cellType)

# for each column (cluster), we select the row (cell type) which has the maximum aggregated value
top_number <- 4
sce_cellType <- sce_cellType[, lapply(.SD, function(x){
                            Best_n <- sort(x, index.return=TRUE, decreasing=TRUE)
                            type[Best_n$ix[1:top_number]]
                          }), .SDcols = colnames(sce_cellType)[-1]]
sce_cellType
##             0   1   2   3   4   5   6          7   8          9         10
## 1:        HSM HSM DAM WAM HSM HSM CAM        IFM IFM Cell_Cycle        SAM
## 2:        SAM WAM CAM DAM PGM PGM IRM        HSM PGM        SAM        IRM
## 3: Cell_Cycle IRM PGM ATM IRM WAM PGM        SAM IRM        HSM        ATM
## 4:        CAM DAM ATM PGM ATM IRM WAM Cell_Cycle WAM        ATM Cell_Cycle
##            11
## 1:        CAM
## 2:        IFM
## 3: Cell_Cycle
## 4:        HSM
acronym <- TRUE
# Assign automatic cell type based on the highest expressed gene

cellType_auto <- if (acronym == TRUE) unlist(sce_cellType[1,]) else meta_data[unlist(sce_cellType[1,])]
label_auto <- paste0(cellType_auto, " (", colnames(sce_cellType), ")")
# After tuning the results, need to change every time you change some parameters
cellType_calibrated <- cellType_auto
if (rb == 15 & mt == 7) {
  # clustering resolution 0.6
  cellType_calibrated[c(8,9)] <- if (acronym==TRUE) c("IFM-I", "IFM-II") else 
                                      c("Inflammatory microglia-I", "Inflammatory microglia-II")
  cellType_calibrated[c(3)]   <- if (acronym==TRUE) c("PAM") else c("Pre-active microglia")

} else if (rb == 10 & mt == 7 | rb == 10 & mt == 5) {
  cellType_calibrated[c(9,10)] <- if (acronym==TRUE) c("IFM-I", "IFM-II") else 
                                      c("Inflammatory microglia-I", "Inflammatory microglia-II")
  cellType_calibrated[c(4)]   <- if (acronym==TRUE) c("PAM") else c("Pre-active microglia")
  cellType_calibrated[c(5)]   <- if (acronym==TRUE) c("DAM") else c("Disease-Associated Microglia")
  cellType_calibrated[c(12)]  <- if (acronym==TRUE) c("SAM") else c("Senescence-associated microglia")

}

label_calibrated <- paste0(cellType_calibrated, " (", colnames(sce_cellType), ")")
# we convert the cells' cluster labels to cell type labels:
seurat_sce$label_auto <- cellType_auto[seurat_sce$seurat_clusters]
seurat_sce$label_cali <- cellType_calibrated[seurat_sce$seurat_clusters]

# we aggregate again to pseudo-bulk using the new clusters
sce_labelAuto <- aggregateData(seurat_sce, "logcounts", by=c("label_auto"), fun="mean")
sce_labelCali <- aggregateData(seurat_sce, "logcounts", by=c("label_cali"), fun="mean")
# we plot again the expression of the markers as a sanity check
assayNames(sce_labelAuto) <- "logcounts"
assayNames(sce_labelCali) <- "logcounts"
seurat_combi$label_auto <- cellType_auto[seurat_sce$seurat_clusters]
seurat_combi$label_calibrated <- cellType_calibrated[seurat_sce$seurat_clusters]

if (saving){
  saveRDS(seurat_combi, "Annotated_seurat_harmony.rds")
}
plot_labelAuto <- DimPlot(seurat_combi, group.by = "label_auto",pt.size = 0.3, cols = pal_npg("nrc")(10), label = TRUE) + 
  plot_annotation(title="Automatic cluster labels") + NoAxes()

plot_labelCali <- DimPlot(seurat_combi, group.by = "label_calibrated",pt.size = 0.3, cols = pal_futurama("planetexpress")(12), label = TRUE) + 
  plot_annotation(title="Calibrated cluster labels") + NoAxes()

(plot_UMAPlabel <- plot_labelAuto + plot_labelCali)

Visualization and result storage

(hmCluster <- showOverallHeatmap(sce_cluster, markerList = exist_list, metaData = meta_data))

(hmClusterGene <- showOverallHeatmap(sce_cluster, markerList = exist_list, metaData = meta_data, clusterRow = TRUE))

(hmLabelAuto <- showOverallHeatmap(sce_labelAuto, markerList = exist_list, metaData = meta_data))

(hmLabelCali <- showOverallHeatmap(sce_labelCali, markerList = exist_list, metaData = meta_data))

if (saving == TRUE){
  pdf(file.path(working_path, "Visualization/Overall_Heatmaps.pdf"), width = 15, height = 15)
  print(hmCluster)
  plot.new()
  print(hmClusterGene)
  plot.new()
  print(hmLabelAuto)
  plot.new()
  print(hmLabelCalis)
  plot.new()
  dev.off()
}
(heatmap_cluIRM <- plotSpecificHeatmap(sce_cluster, exist_list, "IRM"))

(heatmap_cluATMDAM <- plotSpecificHeatmap(sce_cluster, exist_list, c("ATM","DAM")))

(heatmap_labATMDAM <- plotSpecificHeatmap(sce_labelAuto, exist_list, c("ATM","DAM")))

if (saving == TRUE){
  pdf(file.path(working_path, "Visualization/DAM_IRM_ITM_name.pdf"), width = 15, height = 15)
  print(heatmap_showrowname_1)
  print(heatmap_showrowname_2)
  dev.off()
}
links <- data.table(table(source=unlist(paste0("Cluster_", seurat_combi$seurat_clusters)), 
                             target=unlist(seurat_combi$label_calibrated))) %>% 
  .[N != 0,] %>% .[order(-N),]


nodes <- data.frame(
  name=c(as.character(links$source), as.character(links$target)) %>% 
    unique()
)

links$IDsource <- match(links$source, nodes$name)-1 
links$IDtarget <- match(links$target, nodes$name)-1

my_color <- "d3.scaleOrdinal() .domain([' Cluster_0 ',' Cluster_1 ',' Cluster_2 ',' Cluster_3 ',' Cluster_4 ',' Cluster_5 ',' Cluster_6 ',' Cluster_7 ',' Cluster_8 ',' Cluster_9 ',' Cluster_10 ',' Cluster_11 ',' CAM ',' Cell_Cycle ',' DAM ',' HSM ',' IFM-I ',' IFM-II ',' PAM ',' SAM ']) .range([' #FED439FF ',' #709AE1FF ',' #8A9197FF ',' #D2AF81FF ',' #FD7446FF ',' #D5E4A2FF ',' #197EC0FF ',' #F05C3BFF ',' #46732EFF ',' #71D0F5FF ',' #370335FF ',' #075149FF ',' #FF6F00FF ',' #C71000FF ',' #008EA0FF ',' #8A4198FF ',' #5A9599FF ',' #FF6348FF ',' #84D7E1FF ',' #FF95A8FF '])"


(sankeyGraph <- sankeyNetwork(Links = links, Nodes = nodes,
                   Source = "IDsource", Target = "IDtarget",
                   Value = "N", NodeID = "name", 
                   sinksRight=FALSE, fontSize = 12, 
                   fontFamily = "Helvetica", 
                   colourScale = my_color))
topMatClu <- findTopGenes(sce_cluster, markerlist = exist_list)
topMatLab <- findTopGenes(sce_labelCali, markerlist = exist_list)

if (saving == TRUE) {
  write.csv(top_matrix, file.path(working_path,sprintf("TOP_%s_gene_per_matrix.csv", top_number)))
}
seurat_combi$orig.ident[which(seurat_combi$orig.ident  == "GpnmbKO_1" | seurat_combi$orig.ident  == "GpnmbKO_2")] <- "GpnmbKO"

(plot_cellTypeCompAuto <- plotDataComposition(seruatObject = seurat_combi, classLabel = "label_auto", dataLabel = "orig.ident"))

(plot_cellTypeCompCali <- plotDataComposition(seruatObject = seurat_combi, classLabel = "label_calibrated", dataLabel = "orig.ident"))

if (saving == TRUE){
  # for adding the graph
  suppressPackageStartupMessages(library(gridExtra))
  ClusterTable <- data.table::rbindlist(list(sce_cellType, topMatClu))
  CellTypeTable <- topMatLab
  pdf(sprintf("Visualization/Visualization_RB%s_MT%s.pdf", rb, mt), width = 16, height = 8)
  print(plot_UMAP)
  print(plot_labelAuto + plot_labelCali + plot_annotation("Automated vs. Calibrated cell type annotation"))
  print(heatmap_ClusterOverview)
  
  plot.new()
  text(x = 0, y = 1, adj = c(0,1), labels = "Heatmap clustering the gene")
  print(hmClusterGene)
  
  plot.new()
  text(x = 0, y = 1, adj = c(0,1), labels = "Heatmap for each cluster")
  print(hmCluster)
  
  plot.new()
  text(x = 0, y = 1, adj = c(0,1), labels = "Heatmap for each automatically annotated cell type")
  print(hmLabelAuto)
  
  plot.new()
  text(x = 0, y = 1, adj = c(0,1), labels = "Heatmap for each manually calibrated cell type")
  print(hmLabelCali)
  
  print(plot_cellTypeCompAuto + plot_cellTypeCompCali + 
          plot_annotation("Condition composition in automated vs. calibrated cell type annotation"))
  
  grid::grid.newpage()
  grid::grid.text("most likely cell type in each cluster",x = (0.5), y = (0.8))
  grid.table(sce_cellType)
  
  grid::grid.newpage()
  grid::grid.text("most expressed genes in each cluster",x = (0.5), y = (0.9))
  grid.table(topMatClu)
  
  grid::grid.newpage()
  grid::grid.text("most expressed genes in each calibrated cell type",x = (0.5), y = (0.9))
  grid.table(topMatLab)
  
  dev.off()
}

Gene specific plotting (marker list)

For differentially expressed genes, find out their expression profile across different clusters

flist_1 <- c("Cd63","Ctsb","Bax","Cd9","Cd68")
flist_2 <- c("P2ry12","P2ry13","Hexb","Selplg","Tmem119")
flist_3 <- c("Nfkbia","Egr1","Zfp36","Nfkbiz","Ier2","Atf3","Ier3","Apoe", "Trem2", "Tyrobp")
flist_4 <- c("Cd74","H2-Eb1","H2-aa","Cxcl2","Pf4","H2-Ab1")
flist_5 <- c("Top2a","Ccl2","Cxcl10","Hmgb2","Mki67")
flist_6 <- c("Cxcl3","Ifitm3","Slfn5","Ccl12","Ifi204","Stat1","Ifit3","Ifit2","Ccl4","Il1b","Rtp4","Ifitmp","Cst7","Ccl3")
flist_7 <- c("H3f3b","Hsp90aa1","Hsp90ab1","Hsp90b1","Hspa5","Hspa8","Jun","Junb","Malat1","Rps16","Zfp36l1","Fos","Egr1")
flist_8 <- c("Cdkn2a","Cdkn1a","Cdkn2d","Casp8","Glb1")

heatmapmarkersFinal <- c("P2ry12","Cx3cr1","P2ry13","Tmem119","Csf1r","Hexb","Mrc1","Ms4a7","Pf4","Cd9","Trem2","Spp1","Itgax","Cd83","Ifit1","Ifit2","Isg15","Ifitm3","Ccl3","Ccl4","Cst7","Il1b","Rtp4","Cd68","Ctsd","Lamp1","Usp18","Cdk1","Mki67","Birc5","Cd36","Cd74","H2-Aa","Cdkn2a","Cdkn1a","Cdkn2d","Casp8","Il1b","Glb1","Serpine1","Il1rl1") #HOXB8 not in the gene list

feature_list <- lapply(list(flist_1, flist_2, flist_3, flist_4, flist_5, flist_6, flist_7, flist_8), 
                       function(x) {x[x %in% rownames(seurat_combi)]})
colLabel <- c(rep("label", ncol(sce_labelCali)), rep("cluster", ncol(sce_cluster)))
sce_combined <- SingleCellExperiment(cbind(assay(sce_labelCali), assay(sce_cluster)), 
                                     rowData = rowData(sce_labelCali), colData = colLabel)

hmap_feature <- plotSpecificHeatmap(sce_combined, feature_list, gaps_col = "X")
hmap_marker  <- plotSpecificHeatmap(sce_combined, list(Markers=heatmapmarkersFinal), gaps_col = "X")

if (saving == TRUE) {
  pdf(file.path(working_path, "Visualization/Heatmap_Markers_Calibrated.pdf"), width = 17, height = 10)
  print(hmap_feature)
  print(hmap_marker)
  dev.off()
}
VP_clu_list <- plotMarkerList(seurat_combi, feature_list, "seurat_clusters", plot_type = "Vln", saving = saving)
VP_ano_list <- plotMarkerList(seurat_combi, feature_list, "label_auto", plot_type = "Vln", saving = saving)
RP_clu_list <- plotMarkerList(seurat_combi, feature_list, "seurat_clusters", plot_type = "Rdg", saving = saving)
RP_ano_list <- plotMarkerList(seurat_combi, feature_list, "label_auto", plot_type = "Rdg", saving = saving)
HM_clu_list <- plotMarkerList(sce_cluster,  feature_list, "seurat_clusters",plot_type = "Htm", saving = saving)
HM_ano_list <- plotMarkerList(sce_labelAuto, feature_list, "label_auto", plot_type = "Htm", saving = saving)
# Select a random gene for demonstration
featured_gene <- "P2ry13"

# Vlnplot
VPClu_gene <- VlnPlot(seurat_combi, features = featured_gene, 
                   group.by = "seurat_clusters", cols = pal_simpsons()(16),pt.size = 0) + 
            xlab("Clusters") 

VPLab_gene <- VlnPlot(seurat_combi, features = featured_gene, 
                   group.by = "label_calibrated", cols = pal_futurama("planetexpress")(12) ,pt.size = 0) + 
            xlab("Cell type") 
# ridge plot
RPClu_gene <- RidgePlot(seurat_combi, features = featured_gene, group.by = "seurat_clusters", 
                     cols = pal_simpsons()(16), sort = FALSE, stack = FALSE)  + 
            xlab("Expression value")

RPLab_gene <- RidgePlot(seurat_combi, features = featured_gene, group.by = "label_calibrated", 
                     cols = pal_futurama("planetexpress")(12), sort = FALSE, stack = FALSE)  + 
            xlab("Expression value") 

(Plot_gene <- (VPClu_gene | VPLab_gene) / (RPClu_gene | RPLab_gene))

# Dimplot for expression level analysis
color.features <- viridis_pal(option = "C", direction = 1 )(100)
(DP_gene <- FeaturePlot(seurat_combi, features=unlist(exist_list)[1:2], ncol=2, pt.size = 0.5, cols = color.features) & NoAxes())
presentMarker <- c("Trem2","Cd33","Apoe","Sorl1","Mef2c","Spp1","Il1rap","H2-Eb1","Clu","Tyrobp","Ccl3","Ccl4","Ifit1","Myd88","Mtor","Cx3cr1","Sirpa","Mafb","Igf1")

(presentHeatmap <- plotMarkerList(countObject = sce_labelCali, feature_list = presentMarker, 
                                 plot_type = "Htm", group_var = "orig.ident"))
## [[1]]

presentVln <- VlnPlot(seurat_combi, features = presentMarker, group.by = "label_calibrated", cols = pal_simpsons()(16),pt.size = 0, ncol = 10) 
presentRdg <- RidgePlot(seurat_combi, features = presentMarker, group.by = "label_calibrated", cols =  pal_simpsons()(16), ncol = 10) 

if (saving == TRUE) {
  pdf("Visualization/Marker_Vln_ident.pdf", width = 24, height = 20)
  print(presentVln)
  print(presentRdg)
  dev.off()

  pdf("Visualization/Marker_Rdg_ident.pdf", width = 30, height = 8)
  print(presentRdg)
  dev.off()  
}

Epilog

Session info

sessionInfo()
## R version 4.3.0 (2023-04-21)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Monterey 12.2.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: Europe/Zurich
## tzcode source: internal
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] pheatmap_1.0.12             sechm_1.8.0                
##  [3] patchwork_1.1.3             viridis_0.6.3              
##  [5] viridisLite_0.4.2           networkD3_0.4              
##  [7] ggsignif_0.6.4              ggsci_3.0.0                
##  [9] ggplot2_3.4.2               data.table_1.14.8          
## [11] dplyr_1.1.2                 muscat_1.14.0              
## [13] SingleCellExperiment_1.22.0 SummarizedExperiment_1.30.1
## [15] Biobase_2.60.0              GenomicRanges_1.52.0       
## [17] GenomeInfoDb_1.36.0         IRanges_2.34.0             
## [19] S4Vectors_0.38.1            BiocGenerics_0.45.3        
## [21] MatrixGenerics_1.12.0       matrixStats_0.63.0         
## [23] harmony_0.1.1               Rcpp_1.0.10                
## [25] SeuratObject_4.1.4          Seurat_4.3.0.1             
## [27] readxl_1.4.3               
## 
## loaded via a namespace (and not attached):
##   [1] spatstat.sparse_3.0-2     bitops_1.0-7             
##   [3] httr_1.4.6                RColorBrewer_1.1-3       
##   [5] doParallel_1.0.17         numDeriv_2016.8-1.1      
##   [7] tools_4.3.0               sctransform_0.4.0        
##   [9] backports_1.4.1           utf8_1.2.3               
##  [11] R6_2.5.1                  lazyeval_0.2.2           
##  [13] uwot_0.1.16               GetoptLong_1.0.5         
##  [15] withr_2.5.0               sp_2.0-0                 
##  [17] prettyunits_1.1.1         gridExtra_2.3            
##  [19] progressr_0.13.0          cli_3.6.1                
##  [21] spatstat.explore_3.2-3    TSP_1.2-4                
##  [23] labeling_0.4.2            sass_0.4.6               
##  [25] mvtnorm_1.2-3             spatstat.data_3.0-1      
##  [27] blme_1.0-5                ggridges_0.5.4           
##  [29] pbapply_1.7-2             scater_1.29.4            
##  [31] parallelly_1.35.0         limma_3.56.1             
##  [33] rstudioapi_0.14           generics_0.1.3           
##  [35] shape_1.4.6               gtools_3.9.4             
##  [37] ica_1.0-3                 spatstat.random_3.1-6    
##  [39] Matrix_1.6-1.1            ggbeeswarm_0.7.2         
##  [41] fansi_1.0.4               abind_1.4-5              
##  [43] lifecycle_1.0.3           yaml_2.3.7               
##  [45] edgeR_3.42.2              gplots_3.1.3             
##  [47] Rtsne_0.16                grid_4.3.0               
##  [49] promises_1.2.0.1          crayon_1.5.2             
##  [51] miniUI_0.1.1.1            lattice_0.21-8           
##  [53] beachmat_2.16.0           cowplot_1.1.1            
##  [55] pillar_1.9.0              knitr_1.42               
##  [57] ComplexHeatmap_2.16.0     rjson_0.2.21             
##  [59] boot_1.3-28.1             future.apply_1.10.0      
##  [61] codetools_0.2-19          leiden_0.4.3             
##  [63] glue_1.6.2                V8_4.3.3                 
##  [65] vctrs_0.6.2               png_0.1-8                
##  [67] Rdpack_2.5                cellranger_1.1.0         
##  [69] gtable_0.3.3              cachem_1.0.8             
##  [71] xfun_0.39                 rbibutils_2.2.15         
##  [73] S4Arrays_1.0.6            mime_0.12                
##  [75] survival_3.5-5            seriation_1.4.2          
##  [77] iterators_1.0.14          ellipsis_0.3.2           
##  [79] fitdistrplus_1.1-11       ROCR_1.0-11              
##  [81] nlme_3.1-162              pbkrtest_0.5.2           
##  [83] bit64_4.0.5               progress_1.2.2           
##  [85] EnvStats_2.8.1            RcppAnnoy_0.0.21         
##  [87] bslib_0.4.2               TMB_1.9.6                
##  [89] irlba_2.3.5.1             vipor_0.4.5              
##  [91] KernSmooth_2.23-20        colorspace_2.1-0         
##  [93] ggrastr_1.0.2             DESeq2_1.40.2            
##  [95] tidyselect_1.2.0          bit_4.0.5                
##  [97] curl_5.0.0                compiler_4.3.0           
##  [99] BiocNeighbors_1.18.0      hdf5r_1.3.8              
## [101] DelayedArray_0.26.2       plotly_4.10.2            
## [103] scales_1.2.1              caTools_1.18.2           
## [105] remaCor_0.0.16            lmtest_0.9-40            
## [107] stringr_1.5.0             digest_0.6.31            
## [109] goftest_1.2-3             spatstat.utils_3.0-3     
## [111] minqa_1.2.6               variancePartition_1.30.2 
## [113] rmarkdown_2.21            ca_0.71.1                
## [115] aod_1.3.2                 XVector_0.40.0           
## [117] RhpcBLASctl_0.23-42       htmltools_0.5.5          
## [119] pkgconfig_2.0.3           lme4_1.1-34              
## [121] sparseMatrixStats_1.12.0  highr_0.10               
## [123] fastmap_1.1.1             rlang_1.1.1              
## [125] GlobalOptions_0.1.2       htmlwidgets_1.6.2        
## [127] shiny_1.7.4               DelayedMatrixStats_1.22.0
## [129] farver_2.1.1              jquerylib_0.1.4          
## [131] zoo_1.8-12                jsonlite_1.8.4           
## [133] BiocParallel_1.34.1       BiocSingular_1.16.0      
## [135] RCurl_1.98-1.12           magrittr_2.0.3           
## [137] scuttle_1.9.4             GenomeInfoDbData_1.2.10  
## [139] munsell_0.5.0             reticulate_1.32.0        
## [141] stringi_1.7.12            zlibbioc_1.46.0          
## [143] MASS_7.3-59               plyr_1.8.8               
## [145] randomcoloR_1.1.0.1       parallel_4.3.0           
## [147] listenv_0.9.0             ggrepel_0.9.3            
## [149] deldir_1.0-9              splines_4.3.0            
## [151] tensor_1.5                hms_1.1.3                
## [153] circlize_0.4.15           locfit_1.5-9.7           
## [155] igraph_1.4.2              spatstat.geom_3.2-5      
## [157] reshape2_1.4.4            ScaledMatrix_1.8.1       
## [159] evaluate_0.21             nloptr_2.0.3             
## [161] foreach_1.5.2             httpuv_1.6.9             
## [163] RANN_2.6.1                tidyr_1.3.0              
## [165] purrr_1.0.1               polyclip_1.10-6          
## [167] future_1.32.0             clue_0.3-64              
## [169] scattermore_1.2           rsvd_1.0.5               
## [171] broom_1.0.5               xtable_1.8-4             
## [173] later_1.3.1               tibble_3.2.1             
## [175] lmerTest_3.1-3            glmmTMB_1.1.7            
## [177] registry_0.5-1            beeswarm_0.4.0           
## [179] cluster_2.1.4             globals_0.16.2